<!DOCTYPE html>
<html lang="en-us">
  <head>

    <meta charset="utf-8">
<title>Stemming in situ | Elasticsearch: The Definitive Guide [2.x] | Elastic</title>
<link rel="home" href="index.html" title="Elasticsearch: The Definitive Guide [2.x]">
<link rel="up" href="stemming.html" title="Reducing Words to Their Root Form">
<link rel="prev" href="controlling-stemming.html" title="Controlling Stemming">
<link rel="next" href="stopwords.html" title="Stopwords: Performance Versus Precision">
<meta name="DC.type" content="Learn/Docs/Legacy/Elasticsearch/Definitive Guide/2.x">
<meta name="DC.subject" content="Elasticsearch">
<meta name="DC.identifier" content="2.x">
<meta name="robots" content="noindex,nofollow">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <script src="https://cdn.optimizely.com/js/18132920325.js"></script>
    <link rel="apple-touch-icon" sizes="57x57" href="/apple-icon-57x57.png">
    <link rel="apple-touch-icon" sizes="60x60" href="/apple-icon-60x60.png">
    <link rel="apple-touch-icon" sizes="72x72" href="/apple-icon-72x72.png">
    <link rel="apple-touch-icon" sizes="76x76" href="/apple-icon-76x76.png">
    <link rel="apple-touch-icon" sizes="114x114" href="/apple-icon-114x114.png">
    <link rel="apple-touch-icon" sizes="120x120" href="/apple-icon-120x120.png">
    <link rel="apple-touch-icon" sizes="144x144" href="/apple-icon-144x144.png">
    <link rel="apple-touch-icon" sizes="152x152" href="/apple-icon-152x152.png">
    <link rel="apple-touch-icon" sizes="180x180" href="/apple-icon-180x180.png">
    <link rel="icon" type="image/png" href="/favicon-32x32.png" sizes="32x32">
    <link rel="icon" type="image/png" href="/android-chrome-192x192.png" sizes="192x192">
    <link rel="icon" type="image/png" href="/favicon-96x96.png" sizes="96x96">
    <link rel="icon" type="image/png" href="/favicon-16x16.png" sizes="16x16">
    <link rel="manifest" href="/manifest.json">
    <meta name="apple-mobile-web-app-title" content="Elastic">
    <meta name="application-name" content="Elastic">
    <meta name="msapplication-TileColor" content="#ffffff">
    <meta name="msapplication-TileImage" content="/mstile-144x144.png">
    <meta name="theme-color" content="#ffffff">
    <meta name="naver-site-verification" content="936882c1853b701b3cef3721758d80535413dbfd">
    <meta name="yandex-verification" content="d8a47e95d0972434">
    <meta name="localized" content="true">
    <meta name="st:robots" content="follow,index">
    <meta property="og:image" content="https://www.elastic.co/static/images/elastic-logo-200.png">
    <link rel="shortcut icon" href="/favicon.ico" type="image/x-icon">
    <link rel="icon" href="/favicon.ico" type="image/x-icon">
    <link rel="apple-touch-icon-precomposed" sizes="64x64" href="/favicon_64x64_16bit.png">
    <link rel="apple-touch-icon-precomposed" sizes="32x32" href="/favicon_32x32.png">
    <link rel="apple-touch-icon-precomposed" sizes="16x16" href="/favicon_16x16.png">
    <!-- Give IE8 a fighting chance -->
    <!--[if lt IE 9]>
    <script src="https://oss.maxcdn.com/html5shiv/3.7.2/html5shiv.min.js"></script>
    <script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
    <![endif]-->
    <link rel="stylesheet" type="text/css" href="/guide/static/styles.css">
  </head>

  <!--© 2015-2021 Elasticsearch B.V. Copying, publishing and/or distributing without written permission is strictly prohibited.-->

  <body>
    <!-- Google Tag Manager: initialise the dataLayer queue, then load the
         GTM-58RLH5 container with an async script element. The noscript
         iframe is the fallback used when scripting is disabled. Both URLs
         name https explicitly instead of inheriting the page's scheme. -->
    <script>dataLayer = [];</script><noscript><iframe src="https://www.googletagmanager.com/ns.html?id=GTM-58RLH5" title="Google Tag Manager" height="0" width="0" style="display:none;visibility:hidden"></iframe></noscript>
    <script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start': new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0], j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src= 'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f); })(window,document,'script','dataLayer','GTM-58RLH5');</script>
    <!-- End Google Tag Manager -->

    <!-- Global site tag (gtag.js) - Google Analytics -->
    <script async src="https://www.googletagmanager.com/gtag/js?id=UA-12395217-16"></script>
    <script>
      // Reuse the dataLayer array if one is already defined on window;
      // otherwise start with an empty one.
      window.dataLayer = window.dataLayer || [];
      // Queue a call by pushing the raw `arguments` object (not a copied
      // array) onto dataLayer.
      function gtag(){dataLayer.push(arguments);}
      // Queue a 'js' entry carrying the current Date.
      gtag('js', new Date());
      // Queue a 'config' entry for UA-12395217-16, the same id that appears
      // in the gtag/js loader URL.
      gtag('config', 'UA-12395217-16');
    </script>

    <!--BEGIN QUALTRICS WEBSITE FEEDBACK SNIPPET-->
    <script type="text/javascript">
      // Qualtrics site-intercept loader, wrapped in an IIFE so that `g` stays
      // local. Constructor parameters:
      //   e - sampling percentage (compared against 100 and used as e/100),
      //   h - mode string, "v" or "r" (see check below),
      //   f - name of the cookie that stores the sampling state,
      //   g - URL of the script to inject (shadows the outer `g` inside the
      //       constructor body).
      (function(){var g=function(e,h,f,g){
      // get(a): scan document.cookie for an entry starting with a + "=" (after
      // trimming leading spaces) and return its value, or null if none matches.
      this.get=function(a){for(var a=a+"=",c=document.cookie.split(";"),b=0,e=c.length;b<e;b++){for(var d=c[b];" "==d.charAt(0);)d=d.substring(1,d.length);if(0==d.indexOf(a))return d.substring(a.length,d.length)}return null};
      // set(a, c): write cookie a=c with path=/ and an expiry 6048E5 ms
      // (7 days) from now.
      this.set=function(a,c){var b="",b=new Date;b.setTime(b.getTime()+6048E5);b="; expires="+b.toGMTString();document.cookie=a+"="+c+b+"; path=/; "};
      // check(): decide whether the script should be loaded. State lives in
      // cookie f as "mode:rate:counter".
      //  - No cookie and e == 100: return true without writing a cookie.
      //  - No cookie otherwise: in "v" mode replace e with 0 or 100 by a single
      //    random draw (100 with probability e/100), then store [h, e, 0].
      //  - Stored rate 100: true. Mode "v": false. Mode "r": true when
      //    counter % floor(100 / rate) is 0; the counter is then incremented
      //    and saved back to the cookie.
      //  - Any other mode: true.
      this.check=function(){var a=this.get(f);if(a)a=a.split(":");else if(100!=e)"v"==h&&(e=Math.random()>=e/100?0:100),a=[h,e,0],this.set(f,a.join(":"));else return!0;var c=a[1];if(100==c)return!0;switch(a[0]){case "v":return!1;case "r":return c=a[2]%Math.floor(100/c),a[2]++,this.set(f,a.join(":")),!c}return!0};
      // go(): when check() returns true, create a script element with src g
      // and append it to document.body (skipped if document.body is falsy).
      this.go=function(){if(this.check()){var a=document.createElement("script");a.type="text/javascript";a.src=g;document.body&&document.body.appendChild(a)}};
      // start(): run go() on the window load event, using addEventListener
      // when available and attachEvent("onload") otherwise.
      this.start=function(){var a=this;window.addEventListener?window.addEventListener("load",function(){a.go()},!1):window.attachEvent&&window.attachEvent("onload",function(){a.go()})}};
      // Instantiate with rate 100 and mode "r", then register the load hook.
      // Any exception thrown here is caught and ignored.
      try{(new g(100,"r","QSI_S_ZN_emkP0oSe9Qrn7kF","https://znemkp0ose9qrn7kf-elastic.siteintercept.qualtrics.com/WRSiteInterceptEngine/?Q_ZID=ZN_emkP0oSe9Qrn7kF")).start()}catch(i){}})();
    </script><div id="ZN_emkP0oSe9Qrn7kF"><!--DO NOT REMOVE-CONTENTS PLACED HERE--></div>
    <!--END WEBSITE FEEDBACK SNIPPET-->

    <div id="elastic-nav" style="display:none;"></div>
    <script src="https://www.elastic.co/elastic-nav.js"></script>

    <!-- Subnav -->
    <div>
      <div>
        <div class="tertiary-nav d-none d-md-block">
          <div class="container">
            <div class="p-t-b-15 d-flex justify-content-between nav-container">
              <div class="breadcrum-wrapper"><span><a href="/guide/" style="font-size: 14px; font-weight: 600; color: #000;">Docs</a></span></div>
            </div>
          </div>
        </div>
      </div>
    </div>

    <div class="main-container">
      <section id="content">
        <div class="content-wrapper">

          <section id="guide" lang="en">
            <div class="container">
              <div class="row">
                <div class="col-xs-12 col-sm-8 col-md-8 guide-section">
                  <!-- start body -->
                  <div class="page_header">
<p>
  <strong>WARNING</strong>: The 2.x versions of Elasticsearch have passed their
  <a href="https://www.elastic.co/support/eol">EOL dates</a>. If you are running
  a 2.x version, we strongly advise you to upgrade.
</p>
<p>
  This documentation is no longer maintained and may be removed. For the latest
  information, see the <a href="https://www.elastic.co/guide/en/elasticsearch/reference/current/index.html">current
  Elasticsearch documentation</a>.
</p>
</div>
<div id="content">
<div class="breadcrumbs">
<span class="breadcrumb-link"><a href="index.html">Elasticsearch: The Definitive Guide [2.x]</a></span>
»
<span class="breadcrumb-link"><a href="languages.html">Dealing with Human Language</a></span>
»
<span class="breadcrumb-link"><a href="stemming.html">Reducing Words to Their Root Form</a></span>
»
<span class="breadcrumb-node">Stemming in situ</span>
</div>
<div class="navheader">
<span class="prev">
<a href="controlling-stemming.html">« Controlling Stemming</a>
</span>
<span class="next">
<a href="stopwords.html">Stopwords: Performance Versus Precision »</a>
</span>
</div>
<div class="section">
<div class="titlepage"><div><div>
<h2 class="title">
<a id="stemming-in-situ"></a>Stemming in situ<a class="edit_me edit_me_private" rel="nofollow" title="Editing on GitHub is available to Elastic" href="https://github.com/elastic/elasticsearch-definitive-guide/edit/2.x/230_Stemming/60_Stemming_in_situ.asciidoc">edit</a>
</h2>
</div></div></div>
<p>For the sake of completeness, we will finish this chapter by explaining how to
index stemmed words into the same field as unstemmed words. As an example,
analyzing the sentence <em>The quick foxes jumped</em> would produce the following
terms:</p>
<div class="pre_wrapper lang-text">
<pre class="programlisting prettyprint lang-text">Pos 1: (the)
Pos 2: (quick)
Pos 3: (foxes,fox) <a id="CO164-1"></a><i class="conum" data-value="1"></i>
Pos 4: (jumped,jump) <a id="CO164-2"></a><i class="conum" data-value="1"></i></pre>
</div>
<div class="calloutlist">
<table border="0" summary="Callout list">
<tr>
<td align="left" valign="top" width="5%">
<p><a href="#CO164-1"><i class="conum" data-value="1"></i></a><a href="#CO164-2"></a></p>
</td>
<td align="left" valign="top">
<p>The stemmed and unstemmed forms occupy the same position.</p>
</td>
</tr>
</table>
</div>
<div class="warning admon">
<div class="icon"></div>
<div class="admon_content">
<p>Read <a class="xref" href="stemming-in-situ.html#stemming-in-situ-good-idea" title="Is Stemming in situ a Good Idea">Is Stemming in situ a Good Idea</a> before using this approach.</p>
</div>
</div>
<p>To achieve stemming <em>in situ</em>, we will use the
<a href="https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-keyword-repeat-tokenfilter.html" class="ulink" target="_top"><code class="literal">keyword_repeat</code></a>
token filter, which, like the <code class="literal">keyword_marker</code> token filter (see
<a class="xref" href="controlling-stemming.html#preventing-stemming" title="Preventing Stemming">Preventing Stemming</a>), marks each term as a keyword to prevent the subsequent
stemmer from touching it.  However, it also repeats the term in the same
position, and this repeated term <span class="strong strong"><strong>is</strong></span> stemmed.</p>
<p>Using the <code class="literal">keyword_repeat</code> token filter alone would result in the following:</p>
<div class="pre_wrapper lang-text">
<pre class="programlisting prettyprint lang-text">Pos 1: (the,the) <a id="CO165-1"></a><i class="conum" data-value="1"></i>
Pos 2: (quick,quick) <a id="CO165-2"></a><i class="conum" data-value="1"></i>
Pos 3: (foxes,fox)
Pos 4: (jumped,jump)</pre>
</div>
<div class="calloutlist">
<table border="0" summary="Callout list">
<tr>
<td align="left" valign="top" width="5%">
<p><a href="#CO165-1"><i class="conum" data-value="1"></i></a><a href="#CO165-2"></a></p>
</td>
<td align="left" valign="top">
<p>The stemmed and unstemmed forms are the same, and so are repeated
needlessly.</p>
</td>
</tr>
</table>
</div>
<p>To prevent the useless repetition of terms that are the same in their stemmed
and unstemmed forms, we add the
<a href="/guide/en/elasticsearch/reference/2.4/analysis-unique-tokenfilter.html" class="ulink" target="_top"><code class="literal">unique</code></a> token filter into the mix:</p>
<div class="pre_wrapper lang-json">
<pre class="programlisting prettyprint lang-json">PUT /my_index
{
  "settings": {
    "analysis": {
      "filter": {
        "unique_stem": {
          "type": "unique",
          "only_on_same_position": true <a id="CO166-1"></a><i class="conum" data-value="1"></i>
        }
      },
      "analyzer": {
        "in_situ": {
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "keyword_repeat", <a id="CO166-2"></a><i class="conum" data-value="2"></i>
            "porter_stem",
            "unique_stem" <a id="CO166-3"></a><i class="conum" data-value="3"></i>
          ]
        }
      }
    }
  }
}</pre>
</div>
<div class="calloutlist">
<table border="0" summary="Callout list">
<tr>
<td align="left" valign="top" width="5%">
<p><a href="#CO166-1"><i class="conum" data-value="1"></i></a></p>
</td>
<td align="left" valign="top">
<p>The <code class="literal">unique</code> token filter is set to remove duplicate tokens
only when they occur in the same position.</p>
</td>
</tr>
<tr>
<td align="left" valign="top" width="5%">
<p><a href="#CO166-2"><i class="conum" data-value="2"></i></a></p>
</td>
<td align="left" valign="top">
<p>The <code class="literal">keyword_repeat</code> token filter must appear before the
stemmer.</p>
</td>
</tr>
<tr>
<td align="left" valign="top" width="5%">
<p><a href="#CO166-3"><i class="conum" data-value="3"></i></a></p>
</td>
<td align="left" valign="top">
<p>The <code class="literal">unique_stem</code> filter removes duplicate terms after the
stemmer has done its work.</p>
</td>
</tr>
</table>
</div>
<div class="section">
<div class="titlepage"><div><div>
<h3 class="title">
<a id="stemming-in-situ-good-idea"></a>Is Stemming in situ a Good Idea<a class="edit_me edit_me_private" rel="nofollow" title="Editing on GitHub is available to Elastic" href="https://github.com/elastic/elasticsearch-definitive-guide/edit/2.x/230_Stemming/60_Stemming_in_situ.asciidoc">edit</a>
</h3>
</div></div></div>
<p>People like the idea of stemming <em>in situ</em>: “Why use an unstemmed field
<em>and</em> a stemmed field if I can just use one combined field?” But is it a
good idea? The answer is almost always no.  There are two problems.</p>
<p>The first is the inability to separate exact matches from inexact matches.  In
this chapter, we have seen that words with different meanings are often
conflated to the same stem word: <code class="literal">organs</code> and <code class="literal">organization</code> both stem to
<code class="literal">organ</code>.</p>
<p>In <a class="xref" href="using-language-analyzers.html" title="Using Language Analyzers">Using Language Analyzers</a>, we demonstrated how to combine a query on a
stemmed field (to increase recall) with a query on an unstemmed field (to
improve relevance).  When the stemmed and unstemmed fields are separate, the
contribution of each field can be tuned by boosting one field over another
(see <a class="xref" href="multi-query-strings.html#prioritising-clauses" title="Prioritizing Clauses">Prioritizing Clauses</a>).  If, instead, the stemmed and unstemmed forms
appear in the same field, there is no way to tune your search results.</p>
<p>The second issue has to do with how the relevance score is calculated.  In
<a class="xref" href="relevance-intro.html" title="What Is Relevance?">What Is Relevance?</a>, we explained that part of the calculation depends on the
<em>inverse document frequency</em> — how often a word appears in all the documents
in our index.  Using in situ stemming for a document that contains the text
<code class="literal">jump jumped jumps</code> would result in these terms:</p>
<div class="pre_wrapper lang-text">
<pre class="programlisting prettyprint lang-text">Pos 1: (jump)
Pos 2: (jumped,jump)
Pos 3: (jumps,jump)</pre>
</div>
<p>While <code class="literal">jumped</code> and <code class="literal">jumps</code> appear once each and so would have the correct IDF,
<code class="literal">jump</code> appears three times, greatly reducing its value as a search term in
comparison with the unstemmed forms.</p>
<p>For these reasons, we recommend against using stemming in situ.</p>
</div>

</div>
<div class="navfooter">
<span class="prev">
<a href="controlling-stemming.html">« Controlling Stemming</a>
</span>
<span class="next">
<a href="stopwords.html">Stopwords: Performance Versus Precision »</a>
</span>
</div>
</div>

                  <!-- end body -->
                </div>
                <div class="col-xs-12 col-sm-4 col-md-4" id="right_col">
                  <div id="rtpcontainer" style="display: block;">
                    <div class="mktg-promo">
                      <h3>Most Popular</h3>
                      <ul class="icons">
                        <li class="icon-elasticsearch-white"><a href="https://www.elastic.co/webinars/getting-started-elasticsearch?baymax=default&amp;elektra=docs&amp;storm=top-video">Get Started with Elasticsearch: Video</a></li>
                        <li class="icon-kibana-white"><a href="https://www.elastic.co/webinars/getting-started-kibana?baymax=default&amp;elektra=docs&amp;storm=top-video">Intro to Kibana: Video</a></li>
                        <li class="icon-logstash-white"><a href="https://www.elastic.co/webinars/introduction-elk-stack?baymax=default&amp;elektra=docs&amp;storm=top-video">ELK for Logs &amp; Metrics: Video</a></li>
                      </ul>
                    </div>
                  </div>
                </div>
              </div>
            </div>
          </section>

        </div>


<div id="elastic-footer"></div>
<script src="https://www.elastic.co/elastic-footer.js"></script>
<!-- Footer Section end-->

      </section>
    </div>

<script src="/guide/static/jquery.js"></script>
<script type="text/javascript" src="/guide/static/docs.js"></script>
<!-- Defines window.initial_state as an empty object. NOTE(review): presumably read by /guide/static/docs.js; confirm before changing or removing. -->
<script type="text/javascript">
  window.initial_state = {}</script>
  </body>
</html>
